Population visualization¶

In [1]:
import json
import textwrap
from urllib.request import urlopen

import numpy as np #linear algebra
import pandas as pd #data manipulation and analysis
import matplotlib.path as mpath
import matplotlib.pyplot as plt  # used by the bar charts in the "On Hold" section
import plotly.express as px
import plotly.graph_objects as go
from scipy.spatial import ConvexHull, convex_hull_plot_2d

TODO: Explain what will be happening here — this notebook loads the clustered county population data and visualizes which Ohio counties may be at risk.¶

In [2]:
# Read in the county-level dataframe with cluster ids; the CSV's first
# column is used as the row index (index_col=0).
cluster2_df = pd.read_csv("popClusterData.csv", index_col=0)
cluster2_df.head()
Out[2]:
Population EstimatesJuly 1 2021() Persons under 5 years Persons under 18 years Persons 65 years and over Female persons White alone Black or African American alone American Indian and Alaska Native alone Asian alone Native Hawaiian and Other Pacific Islander alone Two or More Races Hispanic or Latino White alonenot Hispanic or Latino Veterans2017-2021 Foreign born persons2017-2021 cluster_id Banned or not County Name
0 27542 1707.60 6665.16 5095.27 13798.54 26660.66 165.25 165.25 82.63 0.00 440.67 302.96 26385.24 1840 192.79 0 0.0 Adams County, Ohio
1 101670 6100.20 23587.44 18503.94 50021.64 84081.09 12912.09 305.01 915.03 0.00 3456.78 3761.79 81132.66 6098 1830.06 0 1.0 Allen County, Ohio
2 52316 2929.70 11666.47 10201.62 26524.21 50484.94 470.84 156.95 418.53 52.32 784.74 837.06 49752.52 3076 784.74 0 0.0 Ashland County, Ohio
3 97337 5450.87 21414.14 19467.40 47889.80 90231.40 3796.14 389.35 486.68 97.34 2433.43 4672.18 86240.58 7158 1557.39 0 0.0 Ashtabula County, Ohio
4 62056 2296.07 8998.12 8874.01 31090.06 56657.13 1799.62 248.22 1737.57 62.06 1613.46 1241.12 55726.29 3255 2544.30 0 0.0 Athens County, Ohio

All counties in each cluster¶

In [3]:
from functionsAll import split_dataframe_by_cluster, get_cluster_coords_dict,coords, clusterk_dict,\
cluster0, cluster1, cluster2
In [4]:
# 1. Split the full frame into one dataframe per cluster id.
df_list = split_dataframe_by_cluster(cluster2_df, 'cluster_id')

# 2. Per-cluster coordinate lists — presumably geographic; see functionsAll.coords.
coords_list = coords(df_list, cluster2_df)

# 3. Combine the per-cluster frames and coordinates into one dict keyed by cluster.
clusterK_dict = clusterk_dict(df_list, coords_list)
In [5]:
# Build one lookup dict per cluster, then report each cluster.
# The original for-loop re-ran the identical dict construction on every
# iteration (the branch conditions never depended on the loop index);
# building the dicts once is equivalent and avoids the repeated work.
n_clusters = len(df_list)
if n_clusters == 1:
    cluster0_dict = cluster0(clusterK_dict)
elif n_clusters == 2:
    cluster0_dict = cluster0(clusterK_dict)
    cluster1_dict = cluster1(clusterK_dict)
elif n_clusters == 3:
    cluster0_dict = cluster0(clusterK_dict)
    cluster1_dict = cluster1(clusterK_dict)
    cluster2_dict = cluster2(clusterK_dict)

for i in range(n_clusters):
    print(f"cluster{i}_dict have been made to dictionary")
        
        
cluster0_dict have been made to dictionary
cluster1_dict have been made to dictionary
cluster2_dict have been made to dictionary

Banned counties in each cluster¶

In [6]:
from functionsBanned import filter_banned_counties, get_banned_cluster_coords_dict, bannedCoords, clusterk_dict_banned,\
cluster0Banned, cluster1Banned, cluster2Banned
In [7]:
# 1. Keep only the counties flagged as banned — see functionsBanned.
banned_counties_df = filter_banned_counties(cluster2_df)

# 2. Coordinates for the banned counties — see functionsBanned.bannedCoords.
bannedCoords_list = bannedCoords(banned_counties_df, cluster2_df)

# 3. Per-cluster dict of banned counties and their coordinates (prints a
#    status line for clusters with enough banned counties for a convex hull).
clusterKBanned_dict = clusterk_dict_banned(banned_counties_df, bannedCoords_list)
Cluster0 had enough banned counties to find non-banned counties in the banned counties convex hull.

Now to look at the counties inside the convex hull of the banned counties¶

In [8]:
from countyName import banned_counties_list, countyNames_cluster0, countyNames_cluster1, countyNames_cluster2, merge_dicts
In [9]:
bannedCountiesList = banned_counties_list(cluster2_df)


# Build the banned-cluster dicts and the county-name lookups once.
# Fixes vs the original:
#  - the identical work was re-run on every iteration of the for-loop
#    (the branch conditions never used the loop index);
#  - in the 3-cluster branch, cluster 2 called countyNames_cluster0 with
#    cluster-2 inputs (wrong helper);
#  - in the 3-cluster branch, allCounties was never assigned, so the cells
#    below would have used a stale/undefined value.
n_banned = len(banned_counties_df)
if n_banned == 1:
    cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
    allCounties = countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)

elif n_banned == 2:
    cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
    cluster1_banned_dict = cluster1Banned(clusterKBanned_dict)

    countyName0 = countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
    countyName1 = countyNames_cluster1(cluster2_df, cluster1_dict, cluster1_banned_dict, bannedCountiesList)

    allCounties = merge_dicts(countyName0, countyName1)

elif n_banned == 3:
    cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
    cluster1_banned_dict = cluster1Banned(clusterKBanned_dict)
    cluster2_banned_dict = cluster2Banned(clusterKBanned_dict)

    countyName0 = countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
    countyName1 = countyNames_cluster1(cluster2_df, cluster1_dict, cluster1_banned_dict, bannedCountiesList)
    countyName2 = countyNames_cluster2(cluster2_df, cluster2_dict, cluster2_banned_dict, bannedCountiesList)  # bug fix: was countyNames_cluster0

    allCounties = merge_dicts(merge_dicts(countyName0, countyName1), countyName2)  # bug fix: was missing

for i in range(n_banned):
    print(f"cluster{i}_banned_dict have been made to dictionary")
        
cluster0_banned_dict have been made to dictionary

Anything above here is good

Plotting Column Pairs¶

In [11]:
# NOTE(review): mid-notebook import; ideally move to the top imports cell.
from topVariableFunctions import filtered_var_pairs, categoryCountyList, subcategoryCountyList, commonKeys, freq_var

# Filter the (category, subcategory) → counties pairs — see topVariableFunctions.
filt = filtered_var_pairs(allCounties)

# County lists grouped by category...
categorycountyList = categoryCountyList(filt)

# ...and by subcategory.
subcategorycountyList = subcategoryCountyList(filt)


# Keys common to both groupings (also prints the most-populated category/subcategory).
combinedVars = commonKeys(categorycountyList, subcategorycountyList)
The category with the most counties is 'Black or African American alone' with 28 counties.
The subcategory with the most counties is 'Two or More Races' with 28 counties.

After all variables are combined, we can now take a look at some of the counties that may be at risk, based on the most common variables among them.

Table of demographic variables and the counties that appear in them the most¶

In [12]:
# Sort variables so those shared by the most counties come first.
sorted_var_dict = dict(
    sorted(combinedVars.items(), key=lambda pair: len(pair[1]), reverse=True)
)
In [13]:
# Build the ranking rows: (rank, variable, #counties, wrapped county list).
# Each county name is wrapped to 10-char pieces rejoined with ", ", then the
# whole string refilled to 80 columns so the table cells stay readable.
ranked_listTbl = []
for rank, (variable, county_list) in enumerate(sorted_var_dict.items(), start=1):
    wrapped = ", ".join(
        textwrap.fill(', '.join(textwrap.wrap(county, width=10)), width=80)
        for county in county_list
    )
    ranked_listTbl.append((rank, variable, len(county_list), wrapped))

topVarTbl = pd.DataFrame(ranked_listTbl)
topVarTbl.columns = ['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']
In [14]:
# Render the variable ranking as a Plotly table with adjusted column widths.
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'


fig = go.Figure(data=[go.Table(
    columnwidth = [50,110,75,700],

    header=dict(
        values=['Rank', 'Demographic Variable', 'Number of Counties', 'List of Counties'],
                    line_color='darkslategray',
                    fill_color=headerColor,
                    align=['left','center'],
                    font=dict(color='white', size=12)
    ),
    cells=dict(
        values=[topVarTbl.Rank, topVarTbl.Demographic_variable, topVarTbl.Number_of_counties, topVarTbl.List_of_Counties],
               line_color='darkslategray',
    # 2-D list of colors: the odd/even pattern repeats down the rows
    fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor]*5],
    align = ['center', 'center', 'center', 'left'],
    font = dict(color = 'darkslategray', size = 11)
    ))
])

fig.show()

Plotting county frequency in each variable¶

Checking to see which counties had the most demographic variables. Then showing a table of what they are

In [ ]:
# Same ranking as the table above, but keeping the raw county lists
# (no text wrapping) so the frame can be exploded into tidy form below.
ranked_list = []
for rank, (variable, county_list) in enumerate(sorted_var_dict.items(), start=1):
    ranked_list.append((rank, variable, len(county_list), county_list))

newdf2 = pd.DataFrame(ranked_list)
newdf2.columns = ['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']

Transforming newdf2 to become tidy:¶

In [ ]:
# Explode so each county gets its own row, drop repeated (Rank, county) rows,
# then rename the columns for clarity.
newdf2 = (
    newdf2.explode('List_of_Counties')
          .drop_duplicates(subset=['Rank', 'List_of_Counties'])
)

newdf3 = newdf2.rename(columns={
    'Demographic_variable': 'Demographic_Variable',
    'Number_of_counties': 'Number_of_Counties',
    'List_of_Counties': 'County'
})

# Display the updated data frame
newdf3.head()
In [ ]:
# Melt the original wide frame to long (County Name, Attribute, Value) form:
melted_df = pd.melt(cluster2_df, id_vars=['County Name'], var_name='Attribute', value_name='Value')
# Strip the trailing " County, Ohio" (13 chars) to get the bare county name —
# assumes every County Name ends with exactly that suffix; TODO confirm.
melted_df['County Name'] = melted_df['County Name'].apply(lambda x: x[:-13])
melted_df
In [ ]:
tidy_df = pd.merge(newdf3, melted_df[['County Name', 'Attribute', 'Value']], left_on=['County','Demographic_Variable'],
                   right_on=['County Name', 'Attribute'])
tidy_df.drop(['Attribute','County Name'], axis=1, inplace=True)
tidy_df.head()
In [ ]:
# Bar chart: counties per demographic variable, colored by county, with each
# segment's attribute value printed inside the bar.
fig = px.bar(tidy_df, x=tidy_df.Demographic_Variable, y=tidy_df.Number_of_Counties, text='Value', color='County',
             labels={"Attribute": "Attribute",
                     "Value": "Attribute Value"
                 },
             title="Default behavior: some text is tiny",)

fig.update_traces(textposition='inside')
# update the layout to adjust the size of the plot
fig.update_layout(
    width=1200,  # plot width in pixels
    height=1400,  # plot height in pixels
)

# display the plot
fig.show()

Plotting county frequency in each variable¶

In [ ]:
# NOTE(review): this rebinds `freq_var` from the imported function to its
# result — re-running this cell raises TypeError ("not callable"). A distinct
# result name would be safer (the cell below would need updating too).
freq_var =freq_var(ranked_list)
In [ ]:
# Convert the (county, variable-list) pairs to a dictionary; the original
# comprehension enumerated the pairs but never used the index.
var_freq_dict = {item[0]: item[1] for item in freq_var}
In [ ]:
# Sort counties so those carrying the most demographic variables come first.
sorted_var_freq_dict = dict(
    sorted(var_freq_dict.items(), key=lambda kv: len(kv[1]), reverse=True)
)
In [ ]:
# Build the ranking rows: (rank, county, #variables, wrapped variable list).
ranked_var_freqViz = []
for rank, (county, var_list) in enumerate(sorted_var_freq_dict.items(), start=1):
    wrapped = ", ".join(
        textwrap.fill(' '.join(textwrap.wrap(v, width=10)), width=80)
        for v in var_list
    )
    ranked_var_freqViz.append((rank, county, len(var_list), wrapped))

top_var_freqdfViz = pd.DataFrame(ranked_var_freqViz)
top_var_freqdfViz.columns = ['Rank', 'County', 'Number_of_variables', 'List_of_variables']
In [ ]:
# Render the county ranking as a Plotly table (same styling as the earlier one).
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'


fig = go.Figure(data=[go.Table(
    columnwidth = [40,70,100,600],

    header=dict(
        values=['Rank', 'County Name', 'Number of Demographic Variables', 'List of Demographics'],
                    line_color='darkslategray',
                    fill_color=headerColor,
                    align=['center','center', 'center', 'left'],
                    font=dict(color='white', size=12)
    ),
    cells=dict(
        values=[top_var_freqdfViz.Rank, top_var_freqdfViz.County, top_var_freqdfViz.Number_of_variables,
                top_var_freqdfViz.List_of_variables],
               line_color='darkslategray',
    # 2-D list of colors: the odd/even pattern repeats down the rows
    fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor]*5],
    align = ['center', 'center', 'center', 'left'],
    font = dict(color = 'darkslategray', size = 11)
    ))
])

fig.show()

Tidying for plot¶

In [ ]:
# Same ranking, but with the raw variable lists so it can be tidied below.
ranked_vars_list = []
for rank, (county, var_list) in enumerate(sorted_var_freq_dict.items(), start=1):
    ranked_vars_list.append((rank, county, len(var_list), var_list))

top_var_freqdf = pd.DataFrame(ranked_vars_list)
top_var_freqdf.columns = ['Rank', 'County', 'Number_of_variables', 'List_of_variables']
In [ ]:
# Explode the variable lists so each (county, variable) pair is one row,
# then drop repeated (Rank, variable) rows.
test2 = top_var_freqdf.explode('List_of_variables')
test2 = test2.drop_duplicates(subset=['Rank', 'List_of_variables'])

# Only 'List_of_variables' actually changes name; the other mappings in the
# original rename were no-ops and are omitted.
test5 = test2.rename(columns={'List_of_variables': 'Demographic_Variable'})

# Display the updated data frame
# test5.head()
In [ ]:
# Attach each (county, variable) pair's value from the melted original frame.
tidy_df2 = pd.merge(test5, melted_df[['County Name', 'Attribute', 'Value']], left_on=['County','Demographic_Variable'],
                   right_on=['County Name', 'Attribute'])
In [ ]:
# Stacked bar chart per county: each county's variables stacked, with the
# attribute value printed inside each segment.
fig = px.bar(tidy_df2, x=tidy_df2.County, color='Demographic_Variable',text='Value',
             labels={"Attribute": "Attribute",
                     "Value": "Attribute Value",
                     "Demographic_Variable":"Demographic Var."
                 },
             title="Default behavior: some text is tiny",)

fig.update_traces(textposition='inside')
# update the layout to adjust the size of the plot
fig.update_layout(
    width=1500,  # plot width in pixels
    height=500,  # plot height in pixels
)

# display the plot
fig.show()

Most frequent Demographic variables¶

In [ ]:
## Most common variables
# Flatten all per-county variable lists. sum(lists, []) is O(n^2) because it
# re-copies the accumulator on every step; a comprehension is linear and
# yields the same list in the same order.
res = [var for var_list in sorted_var_freq_dict.values() for var in var_list]
mostCommonVar = list(set(res))
mostCommonVar

Most Frequent Counties¶

In [ ]:
# Counties ranked by variable count, taken straight from the ranking table.
mostCommonCounties = list(top_var_freqdfViz['County'])
print(mostCommonCounties)

Plotting the counties at risk on a map of Ohio¶

In [ ]:
# Pull the national county GeoJSON and demographic CSV, then keep Ohio only.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv',
                   dtype={"fips": str})

# Filter the data frame to include only Ohio
df_ohio = df[df['STNAME'] == 'Ohio']

# Keep only features whose FIPS id starts with '39' (Ohio's state prefix).
counties_ohio = {
    'type': 'FeatureCollection',
    'features': [f for f in counties['features'] if f['id'][:2] == '39'],
}
In [ ]:
# Temporary join key: strip the trailing ", Ohio" (6 chars) so e.g.
# "Adams County, Ohio" becomes "Adams County", matching df_ohio's CTYNAME.
cluster2_df['TempCounty Name'] = cluster2_df['County Name'].apply(lambda x: x[:-6])
In [ ]:
# Join on the temporary county-name key to pick up each county's FIPS code,
# then drop the join columns (no inplace mutation).
merged_df = pd.merge(cluster2_df, df_ohio[['CTYNAME', 'FIPS']],
                     left_on='TempCounty Name', right_on='CTYNAME')
merged_df = merged_df.drop(columns=['CTYNAME', 'TempCounty Name'])

# print the merged data frame
merged_df.head()
In [ ]:
# Strip " County, Ohio" (13 chars) to get the bare county name, e.g. "Adams".
merged_df['TempCounty Name'] = merged_df['County Name'].apply(lambda x: x[:-13])
In [ ]:
# Example list of counties to check
counties_to_check = merged_df['TempCounty Name'].tolist()

# Counties that are neither "most common" (at risk) nor already banned.
safe_counties = [
    county
    for county in counties_to_check
    if county not in mostCommonCounties and county not in bannedCountiesList
]

# Print the list of safe counties
print(safe_counties)
print(len(safe_counties))
In [ ]:
# Classify each county. mostCommonCounties, bannedCountiesList and
# safe_counties partition the county list (safe_counties was built as
# "everything not in the other two"), so every name gets one of the three
# labels. Bug fix: the original lambda's final fallback indexed
# merged_df['risk'] before that column existed and would have raised a
# KeyError if ever reached; it is removed.
def _risk_level(county):
    """Return the risk label for a bare county name."""
    if county in mostCommonCounties:
        return 'At risk'
    if county in bannedCountiesList:
        return 'Already has Banned'
    return 'Safe'

merged_df['risk'] = merged_df['TempCounty Name'].apply(_risk_level)
In [ ]:
# Quick check of the merged frame with the new 'risk' column.
merged_df.head()
In [ ]:
# Choropleth of Ohio counties colored by risk level.
# Bug fix: the original call contained a stray, unterminated `text='`
# argument — a syntax error that prevented the cell from running.
# The continuous-color arguments (color_continuous_scale, range_color) are
# also dropped: 'risk' is a categorical column, so they had no effect.
fig = px.choropleth(merged_df, geojson=counties_ohio, locations='FIPS', color='risk',
                    scope="usa",
                    hover_data=["County Name"],
                    labels={'risk':'Risk Level'})

fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

On Hold¶

In [ ]:
# Summary frame: one row per (category, subcategory) pair with the number of
# counties that fall in it.
# NOTE(review): the name `df` shadows the Ohio demographics frame loaded above.
filtered_pairs = [
    {'Category': category, 'Subcategory': subcategory, 'Number of Counties': len(county_list)}
    for (category, subcategory), county_list in countyName0.items()
]

df = pd.DataFrame(filtered_pairs)

# Most-populated pairs first.
dfSorted = df.sort_values(by='Number of Counties', ascending=False)
dfSorted.head()
In [ ]:
## Plot 
In [ ]:
# Bar chart: how often each category appears, most frequent first.
plt.figure(figsize=(10, 6))

# Create the bar plot
plt.bar(dfSorted['Category'], dfSorted['Number of Counties'])

# Rotate the x-axis labels to make them easier to read
plt.xticks(rotation=90)

# Set the title and axis labels
plt.title('Frequency of Housing Data Categories', fontsize=14)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Count', fontsize=12)


# Show the plot
plt.show()
In [ ]:
# Bar chart: how often each subcategory appears, most frequent first.
plt.figure(figsize=(10, 6))

# Create the bar plot
plt.bar(dfSorted['Subcategory'], dfSorted['Number of Counties'])

# Rotate the x-axis labels to make them easier to read
plt.xticks(rotation=90)

# Set the title and axis labels
plt.title('Frequency of Housing Data Subcategories', fontsize=14)
plt.xlabel('Subcategory', fontsize=12)  # bug fix: axis label read 'Subategory'
plt.ylabel('Count', fontsize=12)


# Show the plot
plt.show()
In [ ]:
# Step 2: Calculate frequency of each county across all tuple keys KEEP
county_freq = {}
for county_group in countyName0.values():
    for county in county_group:
        # dict.get with a default collapses the original if/else into one line
        county_freq[county] = county_freq.get(county, 0) + 1
In [ ]:
# county_freq
In [ ]:
# Step 4: Rank the counties based on frequency counts KEEP
# (dict.get as the key function reads the same counts as the original lambda)
ranked_counties = sorted(county_freq, key=county_freq.get, reverse=True)
In [ ]:
# NOTE(review): plotly.graph_objs is the legacy alias of plotly.graph_objects,
# which is already imported as `go` in the top import cell — this mid-notebook
# re-import is redundant; presumably harmless, but verify before removing.
import plotly.graph_objs as go

# Bar trace of raw county frequencies.
trace = go.Bar(
    x=list(county_freq.keys()),
    y=list(county_freq.values())
)

# create layout
layout = go.Layout(
    title='Frequency of Counties that show up in the convex hull of banned counties',
    xaxis=dict(
        title='County',
        tickangle=90,
        tickfont=dict(size=10)
    ),
    yaxis=dict(
        title='Frequency'
    ),
    height=600,
    width=1000
)

# create figure
fig = go.Figure(data=[trace], layout=layout)

# show plot
fig.show()

Unique Counties: From here down is good¶

In [ ]:
# Unique county names across all variable-pair buckets (set order is arbitrary).
values = {county for county_list in countyName0.values() for county in county_list}
unique_counties = list(values)
print(unique_counties)

Most common vars¶

In [ ]:
# Collect every category and subcategory name from the dict keys (each key is
# a (category, subcategory) pair); duplicates are kept here and removed below.
# NOTE(review): the name `counties` shadows the GeoJSON `counties` loaded above.
counties = [name for key_pair in countyName0 for name in key_pair]
    
In [ ]:
# De-duplicate while preserving first-seen order. dict.fromkeys is linear,
# unlike the original O(n^2) membership-test loop, and yields the same list.
most_common_keys = list(dict.fromkeys(counties))
In [ ]:
# Display the de-duplicated category/subcategory names.
most_common_keys

Testing things¶

We want to find the counties that appear most often, along with their respective variables.

In [ ]:
# Map each county to the demographic variables it appears under, without
# duplicates. Bug fix: the original recorded only demographic_variable1 the
# first time a county was seen, silently dropping demographic_variable2 for
# that first occurrence.
county_demographics = {}

for demographic_stats, county_list in countyName0.items():
    # unpack the two variables from the demographic stats pair
    demographic_variable1, demographic_variable2 = demographic_stats

    for county in county_list:
        variables = county_demographics.setdefault(county, [])
        for variable in (demographic_variable1, demographic_variable2):
            if variable not in variables:
                variables.append(variable)
In [ ]:
# county_demographics

Counties that all share the same variables¶

In [ ]:
# goodish: need to remove duplicates from dictionary values
# Keep, per county, only the variables that at least one *other* county also
# carries. (Also renames the original loop variable `vars`, which shadowed
# the builtin.)
shared_vars = {}

for county, var_list in county_demographics.items():
    shared_vars[county] = [
        var
        for var in var_list
        if any(
            var in other_list
            for other_county, other_list in county_demographics.items()
            if other_county != county
        )
    ]
In [ ]:
# shared_vars
In [ ]:
# Remove duplicate variables from each county's list.
# Fix: dict.fromkeys preserves first-seen order, unlike set(), whose ordering
# for strings varies between interpreter runs (hash randomization) and made
# this cell non-deterministic.
updated_dict = {}

# Iterate through each key-value pair in the original dictionary
for key, value in shared_vars.items():
    updated_dict[key] = list(dict.fromkeys(value))

# Print the updated dictionary
# updated_dict
In [ ]:
def are_dicts_equal(dict1, dict2):
    """Return True if the two dicts hold exactly the same key/value pairs.

    dict equality in Python is already order-insensitive, so the original
    length check and sorted() round-trip were redundant — and sorted()
    raised TypeError for unorderable mixed-type keys.
    """
    return dict1 == dict2
In [ ]:
# Check whether the shared-variable mapping matches the raw per-county mapping.
are_dicts_equal(shared_vars,county_demographics)
In [ ]:
def are_dicts_equal(dict1, dict2, dict3):
    """Return True if all three dicts hold the same key/value pairs.

    NOTE(review): this redefinition shadows the two-argument version defined
    above; a distinct name would be clearer (the call below would need
    updating too). As above, dict equality is order-insensitive, so the
    length checks and sorted() round-trips were redundant.
    """
    return dict1 == dict2 == dict3
In [ ]:
# Compare all three stages of the county -> variables mapping.
are_dicts_equal(county_demographics, shared_vars, updated_dict)